Libraries

set.seed(895893)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(broom)
library(papaja)
## Loading required package: tinylabels

Introduction

This document demonstrates the suggested procedure for simulating the number of participants necessary to measure items accurately. Given what we know about power, the procedure should address two key issues:

  1. Projected sample sizes should differ based on the variability of the item variances (i.e., greater heterogeneity should increase the projected sample size).
  2. Projected sample sizes should “level off” as pilot data increases. Just as regular power estimates allow studies to be “overpowered” to detect an effect, the same idea should be present here. For example, if one has a 500-person pilot study, our simulations should suggest a point at which items are likely measured well, which may have occurred well before 500 participants.

Method

Data Simulation

Population. The data were simulated using the rnorm function, assuming a normal distribution for 30 items. The item means were simulated with \(\mu\) = 847.48 points and a \(\sigma\) of 150 points. The item \(\sigma\)s were simulated with a mean of 350 points and low variability (\(\sigma\) = 100), medium variability (150), or high variability (200). Each population was simulated with 1000 data points per item. (The commented-out lines below preserve an earlier rating-scale parameterization: \(\mu\) = 4, \(\sigma\) = .25, and item \(\sigma\)s of 2 with variability .2, .4, and .8.)

# earlier rating-scale parameterization, retained for reference
# mu <- rnorm(30, 4, .25)
# sigma1 <- rnorm(30, 2, .2)
# sigma2 <- rnorm(30, 2, .4)
# sigma3 <- rnorm(30, 2, .8)

# item means, plus item SDs for the low, medium, and high heterogeneity populations
mu <- rnorm(30, 847.48, 150)
sigma1 <- rnorm(30, 350, 100)
sigma2 <- rnorm(30, 350, 150)
sigma3 <- rnorm(30, 350, 200)

population1 <- data.frame(
  item = rep(1:30, 1000), 
  score = rnorm(1000*30, mean = mu, sd = sigma1) # mu and sigma recycle over items 1:30
  )

population2 <- data.frame(
  item = rep(1:30, 1000), 
  score = rnorm(1000*30, mean = mu, sd = sigma2)
  )

population3 <- data.frame(
  item = rep(1:30, 1000), 
  score = rnorm(1000*30, mean = mu, sd = sigma3)
  )

# evidence that the populations were simulated correctly
tapply(population1$score, population1$item, mean) # around 847
##         1         2         3         4         5         6         7         8 
##  765.6036  664.7785  725.2771  807.4432  538.3648  900.3546  910.2568 1052.8264 
##         9        10        11        12        13        14        15        16 
##  630.9712  832.6903  420.4910  729.7713 1002.6954  829.4935  860.7070  767.0554 
##        17        18        19        20        21        22        23        24 
##  833.1047  902.2030 1075.1021  861.4978  878.1431 1041.9373  855.7506  937.2122 
##        25        26        27        28        29        30 
##  679.9668  764.6669  972.6437  767.2876  703.4890 1134.4645
tapply(population1$score, population1$item, sd) # around 350
##        1        2        3        4        5        6        7        8 
## 311.2448 360.3193 322.2985 338.7658 314.7784 287.8927 293.7953 420.0879 
##        9       10       11       12       13       14       15       16 
## 520.1255 225.5955 126.2305 260.6342 345.8144 235.1180 335.0544 378.1748 
##       17       18       19       20       21       22       23       24 
## 469.8720 449.9834 368.2411 272.1977 452.8620 242.6836 224.4292 368.2582 
##       25       26       27       28       29       30 
## 428.3773 276.0312 446.6273 347.6342 501.1615 132.2046
sd(tapply(population1$score, population1$item, sd)) # around 100
## [1] 99.311
tapply(population2$score, population2$item, mean) # around 847
##         1         2         3         4         5         6         7         8 
##  725.6040  674.2798  721.9513  819.0289  567.8612  908.0273  909.3387 1017.5575 
##         9        10        11        12        13        14        15        16 
##  633.3741  832.8185  425.6334  734.1888  991.4534  826.5033  896.6126  765.6925 
##        17        18        19        20        21        22        23        24 
##  816.4628  908.4821 1063.4440  871.5552  910.0602 1032.7541  858.2532  913.4935 
##        25        26        27        28        29        30 
##  689.3946  770.0179  952.0898  780.5229  744.3489 1148.2740
tapply(population2$score, population2$item, sd) # around 350
##        1        2        3        4        5        6        7        8 
## 628.7929 133.9378 180.8274 444.5191 534.7801 236.9617 374.8703 272.8363 
##        9       10       11       12       13       14       15       16 
## 533.0573 382.9958 510.5852 202.9281 384.3099 168.4936 486.3912 201.9486 
##       17       18       19       20       21       22       23       24 
## 510.6338 194.6759 361.3305 318.7460 290.1266 361.5759 382.1639 641.2955 
##       25       26       27       28       29       30 
## 427.6073 118.5997 572.2905 226.0813 319.1692 253.5639
sd(tapply(population2$score, population2$item, sd)) # around 150
## [1] 149.3125
tapply(population3$score, population3$item, mean) # around 847
##         1         2         3         4         5         6         7         8 
##  750.3565  653.3462  720.5342  826.4865  561.6759  887.3865  914.7876 1015.7814 
##         9        10        11        12        13        14        15        16 
##  628.8596  811.8146  440.8774  720.1106 1003.1874  819.4390  873.3109  774.0752 
##        17        18        19        20        21        22        23        24 
##  831.6783  921.3816 1054.2702  873.8773  912.0118 1044.3531  847.7998  917.2589 
##        25        26        27        28        29        30 
##  676.2947  749.4620  938.1922  736.1093  708.5602 1133.4742
tapply(population3$score, population3$item, sd) # around 350
##          1          2          3          4          5          6          7 
##  707.46915  521.55827  432.00590  413.38828  295.58972  327.81660  694.47902 
##          8          9         10         11         12         13         14 
##  333.32744  682.78489  373.40442  586.09688  716.46934  136.71397  429.01578 
##         15         16         17         18         19         20         21 
##   35.82413  243.38181  314.72626  509.47587  857.63676   63.15200  158.46518 
##         22         23         24         25         26         27         28 
##  437.23435  509.70987 1157.50955  276.46220  235.43245  538.14213  552.93501 
##         29         30 
##  324.61305  306.14171
sd(tapply(population3$score, population3$item, sd)) # around 200
## [1] 242.2529

Samples. Each population was then sampled as if a researcher were conducting a pilot study. The sample sizes started at 20 participants per item and increased in steps of 5 up to 100 participants.

# create pilot samples from 20 to 100
samples1 <- samples2 <- samples3 <- list() 
sizes <- seq(20, 100, by = 5)
for (i in 1:length(sizes)){
  samples1[[i]] <- population1 %>% 
    group_by(item) %>% 
    slice_sample(n = sizes[i])
  
  samples2[[i]] <- population2 %>% 
    group_by(item) %>% 
    slice_sample(n = sizes[i])
    
  samples3[[i]] <- population3 %>% 
    group_by(item) %>% 
    slice_sample(n = sizes[i])
}

AIPE Criteria. The standard errors of each item were calculated to mimic the AIPE procedure of finding an appropriately small confidence interval, as the standard error is the main component of the normal-distribution confidence interval formula. Standard errors were calculated at each decile of the items up to 90% (0% = smallest SE, 10%, …, 90% = largest SE). The lower deciles represent a strict criterion for accurate measurement, as most items would need small SEs to meet AIPE goals, while the higher deciles represent less strict criteria.
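
To make the role of the standard error concrete, here is a minimal sketch of a normal-theory confidence interval (a z-based interval with hypothetical values on the simulated metric):

# minimal sketch: the SE directly sets the width of a normal-theory CI
x <- rnorm(50, mean = 847.48, sd = 350) # one hypothetical item, n = 50
se <- sd(x)/sqrt(length(x))
mean(x) + c(-1, 1) * qnorm(.975) * se # interval width is 2 * 1.96 * SE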

# calculate the SEs and the cutoff scores 
SES1 <- SES2 <- SES3 <- list()
cutoffs1 <- cutoffs2 <- cutoffs3 <- list()
for (i in 1:length(samples1)){
  SES1[[i]] <- tapply(samples1[[i]]$score,
                      samples1[[i]]$item,
                      function(x){ sd(x)/sqrt(length(x)) })
  SES2[[i]] <- tapply(samples2[[i]]$score,
                      samples2[[i]]$item,
                      function(x){ sd(x)/sqrt(length(x)) })
  SES3[[i]] <- tapply(samples3[[i]]$score,
                      samples3[[i]]$item,
                      function(x){ sd(x)/sqrt(length(x)) })

  cutoffs1[[i]] <- quantile(SES1[[i]], probs = seq(0, .9, by = .1))
  cutoffs2[[i]] <- quantile(SES2[[i]], probs = seq(0, .9, by = .1))
  cutoffs3[[i]] <- quantile(SES3[[i]], probs = seq(0, .9, by = .1))
}

AIPE Simulation

In this section, we simulate what a researcher might do when following our suggested application of AIPE to sample size planning based on well-measured items. Treating each pilot sample as a dataset a researcher has collected, we simulate candidate sample sizes from 20 to 1,000 in steps of 5 to determine what the new sample size suggestion would be; we assume that samples over 500 may be considered too large for many researchers who do not work in teams. The standard error of each item was calculated for each candidate sample size, pilot sample size, and population type.

# sequence of sample sizes to try
samplesize_values <- seq(20, 1000, 5)

# place to store everything
sampled_values1 <- sampled_values2 <- sampled_values3 <- list()

# loop over the samples
for (i in 1:length(samples1)){
  
  # create a blank table for us to save the values in 
  sim_table1 <- matrix(NA, 
                      nrow = length(samplesize_values), 
                      ncol = 30)
  
  # make it a data frame
  sim_table1 <- sim_table2 <- sim_table3 <- as.data.frame(sim_table1)
  
  # add a place for sample size values 
  sim_table1$sample_size <- sim_table2$sample_size <- sim_table3$sample_size <- NA
  
  # loop over candidate sample sizes
  for (q in 1:length(samplesize_values)){
      
    # temp dataframe that samples and summarizes
    temp <- samples1[[i]] %>% 
      group_by(item) %>% 
      slice_sample(n = samplesize_values[q], replace = TRUE) %>% 
      summarize(se = sd(score)/sqrt(length(score)))
    
    sim_table1[q, 1:30] <- temp$se
    sim_table1[q, 31] <- samplesize_values[q]
    
    temp <- samples2[[i]] %>% 
      group_by(item) %>% 
      slice_sample(n = samplesize_values[q], replace = TRUE) %>% 
      summarize(se = sd(score)/sqrt(length(score)))
    
    sim_table2[q, 1:30] <- temp$se
    sim_table2[q, 31] <- samplesize_values[q]
    
    temp <- samples3[[i]] %>% 
      group_by(item) %>% 
      slice_sample(n = samplesize_values[q], replace = TRUE) %>% 
      summarize(se = sd(score)/sqrt(length(score)))
    
    sim_table3[q, 1:30] <- temp$se
    sim_table3[q, 31] <- samplesize_values[q]
    
    } # end pilot sample loop 
  
  sampled_values1[[i]] <- sim_table1
  sampled_values2[[i]] <- sim_table2
  sampled_values3[[i]] <- sim_table3

} # end all sample loop 

Next, the percentage of items falling below the cutoff scores, and thus considered “well measured,” was calculated for each decile by sample.

summary_list1 <- summary_list2 <- summary_list3 <- list()
for (i in 1:length(sampled_values1)){
  
  summary_list1[[i]] <- sampled_values1[[i]] %>% 
    pivot_longer(cols = -c(sample_size)) %>% 
    rename(item = name, se = value) %>% 
    group_by(sample_size) %>% 
    summarize(Percent_Below0 = sum(se <= cutoffs1[[i]][1])/30,
             Percent_Below10 = sum(se <= cutoffs1[[i]][2])/30,
             Percent_Below20 = sum(se <= cutoffs1[[i]][3])/30,
             Percent_Below30 = sum(se <= cutoffs1[[i]][4])/30,
             Percent_Below40 = sum(se <= cutoffs1[[i]][5])/30,
             Percent_Below50 = sum(se <= cutoffs1[[i]][6])/30, 
             Percent_Below60 = sum(se <= cutoffs1[[i]][7])/30, 
             Percent_Below70 = sum(se <= cutoffs1[[i]][8])/30, 
             Percent_Below80 = sum(se <= cutoffs1[[i]][9])/30, 
             Percent_Below90 = sum(se <= cutoffs1[[i]][10])/30) %>% 
    mutate(original_n = sizes[i])
  
  summary_list2[[i]] <- sampled_values2[[i]] %>% 
    pivot_longer(cols = -c(sample_size)) %>% 
    rename(item = name, se = value) %>% 
    group_by(sample_size) %>% 
    summarize(Percent_Below0 = sum(se <= cutoffs2[[i]][1])/30,
             Percent_Below10 = sum(se <= cutoffs2[[i]][2])/30,
             Percent_Below20 = sum(se <= cutoffs2[[i]][3])/30,
             Percent_Below30 = sum(se <= cutoffs2[[i]][4])/30,
             Percent_Below40 = sum(se <= cutoffs2[[i]][5])/30,
             Percent_Below50 = sum(se <= cutoffs2[[i]][6])/30, 
             Percent_Below60 = sum(se <= cutoffs2[[i]][7])/30, 
             Percent_Below70 = sum(se <= cutoffs2[[i]][8])/30, 
             Percent_Below80 = sum(se <= cutoffs2[[i]][9])/30, 
             Percent_Below90 = sum(se <= cutoffs2[[i]][10])/30) %>% 
    mutate(original_n = sizes[i])
  
  summary_list3[[i]] <- sampled_values3[[i]] %>% 
    pivot_longer(cols = -c(sample_size)) %>% 
    rename(item = name, se = value) %>% 
    group_by(sample_size) %>% 
    summarize(Percent_Below0 = sum(se <= cutoffs3[[i]][1])/30,
             Percent_Below10 = sum(se <= cutoffs3[[i]][2])/30,
             Percent_Below20 = sum(se <= cutoffs3[[i]][3])/30,
             Percent_Below30 = sum(se <= cutoffs3[[i]][4])/30,
             Percent_Below40 = sum(se <= cutoffs3[[i]][5])/30,
             Percent_Below50 = sum(se <= cutoffs3[[i]][6])/30, 
             Percent_Below60 = sum(se <= cutoffs3[[i]][7])/30, 
             Percent_Below70 = sum(se <= cutoffs3[[i]][8])/30, 
             Percent_Below80 = sum(se <= cutoffs3[[i]][9])/30, 
             Percent_Below90 = sum(se <= cutoffs3[[i]][10])/30) %>% 
    mutate(original_n = sizes[i])
}

summary_DF <- bind_rows(summary_list1, 
                        summary_list2, 
                        summary_list3)
summary_DF$source <- rep(c("low", "med", "high"), each = length(sampled_values1)*length(samplesize_values))
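
As a structural check, the combined data frame should contain one row per candidate sample size (197 values from 20 to 1,000) for each of the 17 pilot sizes and three populations:

nrow(summary_DF) # 197 candidate sizes * 17 pilot sizes * 3 populations = 10,047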

From these data, we pinpoint the smallest suggested sample size at which 80%, 85%, 90%, and 95% of the items fall below the cutoff criterion. These values parallel popular “power” targets: one could determine a minimum suggested sample size (e.g., requiring 80% of items) and a maximum suggested sample size (e.g., requiring 90% or more).

summary_long_80 <- summary_DF %>% 
  pivot_longer(cols = -c(sample_size, original_n, source)) %>% 
  filter(value >= .80) %>% 
  arrange(sample_size, original_n, source, name) %>% 
  group_by(original_n, name, source) %>% 
  slice_head(n = 1) %>% 
  mutate(power = 80)

summary_long_85 <- summary_DF %>% 
  pivot_longer(cols = -c(sample_size, original_n, source)) %>% 
  filter(value >= .85) %>% 
  arrange(sample_size, original_n, source, name) %>% 
  group_by(original_n, name, source) %>% 
  slice_head(n = 1) %>% 
  mutate(power = 85)
  
summary_long_90 <- summary_DF %>% 
  pivot_longer(cols = -c(sample_size, original_n, source)) %>% 
  filter(value >= .90) %>% 
  arrange(sample_size, original_n, source, name) %>% 
  group_by(original_n, name, source) %>% 
  slice_head(n = 1) %>% 
  mutate(power = 90)

summary_long_95 <- summary_DF %>% 
  pivot_longer(cols = -c(sample_size, original_n, source)) %>% 
  filter(value >= .95) %>% 
  arrange(sample_size, original_n, source, name) %>% 
  group_by(original_n, name, source) %>% 
  slice_head(n = 1) %>% 
  mutate(power = 95)

summary_long <- rbind(summary_long_80, 
                      summary_long_85,
                      summary_long_90,
                      summary_long_95)

summary_long$source <- factor(summary_long$source, 
                              levels = c("low", "med", "high"),
                              labels = c("Low", "Medium", "High"))
# numeric coding of heterogeneity (retains the earlier rating-scale SD labels)
summary_long$sd_original <- .2
summary_long$sd_original[summary_long$source == "Medium"] <- .4
summary_long$sd_original[summary_long$source == "High"] <- .8
summary_long$name <- gsub("Percent_Below", "Decile ", summary_long$name)

Results

Differences in Item Variance

We examined whether this procedure is sensitive to differences in item heterogeneity. We should expect to need larger samples when we wish to have a large number of items reach a threshold of acceptable variance; this would, for instance, assure that items could be averaged when a researcher does not wish to use a more complex analysis such as multilevel modeling.

The figure below illustrates the potential minimum sample size for 80% of items to achieve a desired cutoff score. The black dots mark where the suggested sample size equals the original sample size, as a reference line. Comparing the facets shows that our suggested procedure does capture differences in heterogeneity: as the heterogeneity of the item variances increases, the proposed sample size also increases, especially at stricter cutoffs. Missing cutoff points indicate cases where the proposed sample size would exceed the largest simulated size.

ggplot(summary_long %>% filter(power == 80), aes(original_n, sample_size, color = name)) + 
  geom_point() +  
  geom_point(aes(original_n, original_n), color = "black") + 
  geom_line() + 
  theme_classic() + 
  facet_wrap(~ source) + 
  xlab("Original Sample Size") + 
  ylab("Suggested Sample Size") + 
  scale_color_discrete(name = "Cutoff Score")

Sample Size Sensitivity to Pilot Data Size

For our second question, we examined whether the suggested procedure is sensitive to the amount of information present in the pilot data. Larger pilot samples are more informative, and therefore we should expect lower projected sample sizes. As shown in the figure below for the low-variability data only, we do not find this effect. These simplistic simulations from the pilot data nearly always suggest a larger sample size, increasing roughly linearly with pilot sample size. This result follows from the nature of the procedure: if we base our estimates on some SE cutoff, we will almost always need somewhat more people for items to meet those goals. This result does not achieve our second goal.

ggplot(summary_long %>% filter(source == "Low"), aes(original_n, sample_size, color = name)) + 
  geom_point() +  
  geom_point(aes(original_n, original_n), color = "black") + 
  geom_line() + 
  theme_classic() + 
  facet_wrap(~ power) + 
  xlab("Original Sample Size") + 
  ylab("Suggested Sample Size") + 
  scale_color_discrete(name = "Cutoff Score")

Therefore, we suggest applying a correction factor to the simulation procedure to account for the known asymptotic nature of power (i.e., at larger sample sizes, increases in power level off). For this function, we combined a correction factor for the upward biasing of effect sizes (Hedges’ correction) with the formula for exponential decay. The decay factor is calculated as follows:

\[1 - \left(\sqrt{\frac{N_{pilot} - \min(N_{sim})}{N_{pilot}}}\right)^{\log_2(N_{pilot})}\]

Here, \(N_{pilot}\) is the pilot sample size and \(\min(N_{sim})\) is the minimum simulated sample size; subtracting the minimum ensures that the smallest pilots do not decay at all (the fraction, and with it the entire decay term, zeroes out, leaving a multiplier of 1). The square-rooted fraction is raised to the power of \(\log_2(N_{pilot})\), which makes the multiplier shrink smoothly as the pilot sample size increases. The proposed sample size is then multiplied by this decay factor. As shown in the figure below, the correction produces the desired behavior: small pilot studies still receive increased sample size suggestions, and the suggestions level off as the pilot sample size increases.

decay <- 1-sqrt((summary_long$original_n-20)/summary_long$original_n)^log2(summary_long$original_n)

summary_long$new_sample <- summary_long$sample_size*decay
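
As a quick check on the decay factor’s behavior, consider a few of the pilot sizes used above (the values in the comment are approximate):

# decay multiplier at several pilot sizes; the minimum simulated size is 20
pilot_n <- c(20, 40, 60, 80, 100)
round(1 - sqrt((pilot_n - 20)/pilot_n)^log2(pilot_n), 2)
# approximately 1.00, 0.84, 0.70, 0.60, 0.52: a 20-person pilot keeps the full
# suggestion, while a 100-person pilot keeps roughly half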

ggplot(summary_long %>% filter(source == "Low"), aes(original_n, new_sample, color = name)) + 
  geom_point() +  
  geom_point(aes(original_n, original_n), color = "black") + 
  geom_line() + 
  theme_classic() + 
  facet_wrap(~ power) + 
  xlab("Original Sample Size") + 
  ylab("Suggested Sample Size") + 
  scale_color_discrete(name = "Cutoff Score")

Corrections for Individual Researchers

We have shown that this procedure, with a correction factor, can perform as desired. However, in real scenarios, researchers will have only one pilot sample, not many as shown here. What should a researcher do to correct the sample size based on their own pilot data?

To explore whether we could recover the new suggested sample size from data a researcher would actually have, we used linear models to build a formula for this calculation. First, the corrected sample size was predicted from the original suggested sample size. Next, the standard deviation of the item standard deviations was added to the equation. Last, we included the pilot sample size.

user_model <- lm(new_sample ~ sample_size, data = summary_long)
user_print <- apa_print(user_model)

user_model2 <- lm(new_sample ~ sample_size + sd_original, data = summary_long)
user_print2 <- apa_print(user_model2)

user_model3 <- lm(new_sample ~ sample_size + sd_original + original_n, data = summary_long)
user_print3 <- apa_print(user_model3)

change_table <- tidy(anova(user_model, user_model2, user_model3))

The first model, using the original suggested sample size to predict the new sample size, was significant, \(R^2 = .92\), 90% CI \([0.92, 0.93]\), \(F(1, 1869) = 22,165.05\), \(p < .001\), capturing over 90% of the variance. The second model, adding the item standard deviation, improved on the first, \(F(1, 1868) = 1.45\), \(p < .001\), \(R^2 = .92\), 90% CI \([0.92, 0.93]\). The addition of the original pilot sample size was also significant, \(F(1, 1867) = 1848.41\), \(p < .001\), \(R^2 = .96\), 90% CI \([0.96, 0.96]\).

As shown in the table below, the new suggested sample size is proportional to the original suggested sample size (i.e., b < 1), which reduces the sample size suggestion. The item-variability term is small and nonsignificant once the other predictors are included, even though heterogeneity raised the uncorrected suggestions shown above. Last, to correct for large pilot samples, larger original pilot sample sizes decrease the new suggested sample size. This approximation captures 96% of the variance in the corrected sample sizes and should allow a researcher to estimate based on their own data.

apa_table(tidy(user_model3))
term         estimate  std.error  statistic  p.value
(Intercept)     65.19       2.00      32.65     0.00
sample_size      0.70       0.00     212.34     0.00
sd_original     -0.83       2.55      -0.33     0.74
original_n      -1.14       0.03     -42.99     0.00
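
To make the approximation usable in practice, the rounded coefficients from the table can be wrapped in a small helper; suggest_n below is a hypothetical convenience function built from those coefficients, not part of the formal procedure:

# hypothetical helper using the rounded coefficients from the table above
suggest_n <- function(sample_size, sd_original, original_n) {
  65.19 + 0.70 * sample_size - 0.83 * sd_original - 1.14 * original_n
}

# e.g., a 50-person pilot whose uncorrected suggestion was 400 and whose item
# SDs had a standard deviation of .4 on the coded metric:
suggest_n(sample_size = 400, sd_original = .4, original_n = 50) # about 288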

Choosing an Appropriate Cutoff

by_cutoff <- list()
R2 <- list()

for (cutoff in unique(summary_long$name)){
  by_cutoff[[cutoff]] <- lm(new_sample ~ sample_size + sd_original + original_n, data = summary_long  %>% filter(name == cutoff))
  R2[[cutoff]] <- summary(by_cutoff[[cutoff]])$r.squared
}

R2
## $`Decile 0`
## [1] 0.9812539
## 
## $`Decile 10`
## [1] 0.9665002
## 
## $`Decile 20`
## [1] 0.9727879
## 
## $`Decile 30`
## [1] 0.9723443
## 
## $`Decile 40`
## [1] 0.9685064
## 
## $`Decile 50`
## [1] 0.9658231
## 
## $`Decile 60`
## [1] 0.9675184
## 
## $`Decile 70`
## [1] 0.9686524
## 
## $`Decile 80`
## [1] 0.9681113
## 
## $`Decile 90`
## [1] 0.9677101

Last, we examine the question of an appropriate SE decile. All graphs by power, variability, and correction are presented below. If we examine the \(R^2\) values from fitting our regression equation to each decile separately, the 10% (0.967) and 30% (0.972) deciles represent the best match to our corrected sample size suggestions. The 10% decile is likely unfeasible for many researchers, as the sample sizes are often quite large. The 30% decile, in the corrected format, appears to meet all goals: 1) it increases with heterogeneity, 2) it suggests higher values for smaller pilots and levels off as pilot data grows, and …

Low Variability

ggplot(summary_long %>% filter(source == "Low"), aes(original_n, sample_size, color = name)) + 
  geom_point() +  
  geom_point(aes(original_n, original_n), color = "black") + 
  geom_line() + 
  theme_classic() + 
  facet_wrap(~ power) + 
  xlab("Original Sample Size") + 
  ylab("Suggested Sample Size") + 
  scale_color_discrete(name = "Cutoff Score")

ggplot(summary_long %>% filter(source == "Low"), aes(original_n, new_sample, color = name)) + 
  geom_point() +  
  geom_point(aes(original_n, original_n), color = "black") + 
  geom_line() + 
  theme_classic() + 
  facet_wrap(~ power) + 
  xlab("Original Sample Size") + 
  ylab("Suggested Sample Size") + 
  scale_color_discrete(name = "Cutoff Score")

Medium Variability

ggplot(summary_long %>% filter(source == "Medium"), aes(original_n, sample_size, color = name)) + 
  geom_point() +  
  geom_point(aes(original_n, original_n), color = "black") + 
  geom_line() + 
  theme_classic() + 
  facet_wrap(~ power) + 
  xlab("Original Sample Size") + 
  ylab("Suggested Sample Size") + 
  scale_color_discrete(name = "Cutoff Score")

ggplot(summary_long %>% filter(source == "Medium"), aes(original_n, new_sample, color = name)) + 
  geom_point() +  
  geom_point(aes(original_n, original_n), color = "black") + 
  geom_line() + 
  theme_classic() + 
  facet_wrap(~ power) + 
  xlab("Original Sample Size") + 
  ylab("Suggested Sample Size") + 
  scale_color_discrete(name = "Cutoff Score")

High Variability

ggplot(summary_long %>% filter(source == "High"), aes(original_n, sample_size, color = name)) + 
  geom_point() +  
  geom_point(aes(original_n, original_n), color = "black") + 
  geom_line() + 
  theme_classic() + 
  facet_wrap(~ power) + 
  xlab("Original Sample Size") + 
  ylab("Suggested Sample Size") + 
  scale_color_discrete(name = "Cutoff Score")

ggplot(summary_long %>% filter(source == "High"), aes(original_n, new_sample, color = name)) + 
  geom_point() +  
  geom_point(aes(original_n, original_n), color = "black") + 
  geom_line() + 
  theme_classic() + 
  facet_wrap(~ power) + 
  xlab("Original Sample Size") + 
  ylab("Suggested Sample Size") + 
  scale_color_discrete(name = "Cutoff Score")